This notebook contains a generative process for a multi-omic regulatory network and a simulated dataset of genetic perturbation experiments.
Why is this helpful? - The generated dataset can be trained to infer cause->effect relationships from dynamics that can then be compared to the true relationships in the regulatory network. - We can mask certain molecular species (e.g., metabolites or proteins) so that they serve as unmeasured confounders. This can be used to evaluate causal inference methods sensitivity to mis-specification and can help to inform the overall value of collecting multiomic data (i.e., directly measure the metabolites and/or proteins).
A multi-omic regulatory network contains multiple different classes of molecules and a set of directed edges between regulators and targets reflecting causal relationships. In a single ’omic model we want to treat all species equivalently, but in a multiomic model it makes sense to mold the relationships between molecules to better reflect known molecular biology.
Factors to include a general simulation:
To create a network featuring such interactions, we can start with a Barabasi-Albert model. In this model, molecular species are added one at a time and each time a new species is added it creates N connections to existing species. Since species added earlier will have more chances of being selected by newly added species they become more highly connected, exhibiting preferential attachment. This results in an overall degree distribution which follows a power-law distribution (to make the physicists happy).
Updating this model so it conforms to the multiomics rules above can be done by pruning edges based on the overall feasibility of cross-class interactions and adding edges between all RNAs and the protein they encode.
To simulate dynamics from this network we can write ODEs for each molecular species where all regulators of a given species are represented as activators or inhibitors. Without care such ODEs can easily explode to \(\infty\) or drop to \(\le 0\). To avoid this, we can use saturable rate equations like convenience kinetics, along with basal synthesis rates, and first order decay. This allows us to find the derivative of each species based on the current levels of all species and causal effect sizes.
Iterating this system forward finds a steady-state of stable molecule abundances. We can then increase the basal synthesis of any given molecular species to create a synthetic induction experiment.
library(dplyr)
library(ggplot2)
library(ggraph) # to plot network elements
library(deSolve) # for solving the system of differential equations
set.seed(1234)
# parameters of the simulation
### Defining connectivity
# # of genes
N_GENES <- 100
# relative fraction of metabolites w.r.t. # of genes
METAB_FRAC <- 0.3
### Defining ODEs
# average effects size between a cause and effect
MEAN_EFFECT_SIZE <- 1
# basal production rate (unaffected by regulation)
BASAL_PROD <- 0.2
# first order decay rate
LOSS_RATE <- 0.4
### Defining ODE simulation
# how long to simulate to find an initial steady state
MAX_TIME <- 3000
# step size for ODE updates
TIME_RESOLUTION <- 0.1
n_metabolites <- N_GENES * METAB_FRAC
n_species <- N_GENES * 2 + n_metabolites # genes are doubled up because: 1 transcript + 1 protein
net_connections <- igraph::sample_pa(n_species, power = 1, m = 3)
# summarize degrees
edge_counts <- igraph::degree(net_connections) %>%
table() %>%
{tibble::tibble(degree = names(.),
count = unname(.))} %>%
dplyr::mutate_all(as.integer)
ggplot(edge_counts, aes(x = degree, y = count)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
ggtitle("Degree distribution of Barabasi-Albert network", subtitle = "Yay powerlaws") +
theme_bw()
species_init <- dplyr::bind_rows(
tibble::tibble(gene_id = 1:N_GENES,
species_type = "transcript"),
tibble::tibble(gene_id = 1:N_GENES,
species_type = "protein"),
tibble::tibble(gene_id = NA_integer_,
species_type = rep("metabolite", n_metabolites))
) %>%
dplyr::mutate(species_id = sample(1:n_species))
# edge_plausibility
# edge_plausibility <- tibble::tribble(~ from_type, ~ to_type, ~ interaction_type, ~ plausibility,
# "transcript", "transcript", NA_character_, 0,
# "transcript", "protein", NA_character_, 0,
# "transcript", "metabolite", NA_character_, 0,
# "protein", "transcript", "TF", 1,
# "protein", "protein", "PPI", 0.5,
# "protein", "metabolite", "enzyme", 1,
# "metabolite", "transcript", "MDI", 0.2,
# "metabolite", "protein", "MPI", 1,
# "metabolite", "metabolite", "interconversion", 1)
edge_plausibility <- tibble::tribble(~ from_type, ~ to_type, ~ interaction_type, ~ plausibility,
"transcript", "transcript", "transcript -> transcript", 1,
"transcript", "protein", "transcript -> protein", 1,
"transcript", "metabolite", "transcript -> metabolite", 1,
"protein", "transcript", "TF", 1,
"protein", "protein", "PPI", 1,
"protein", "metabolite", "enzyme", 1,
"metabolite", "transcript", "MDI", 1,
"metabolite", "protein", "MPI", 1,
"metabolite", "metabolite", "interconversion", 1)
edges_df <- igraph::as_data_frame(net_connections) %>%
tibble::as_tibble() %>%
dplyr::rename(from = to, to = from) %>%
dplyr::mutate_all(as.integer) %>%
dplyr::left_join(species_init %>%
dplyr::select(from = species_id, from_type = species_type),
by = "from") %>%
dplyr::left_join(species_init %>%
dplyr::select(to = species_id, to_type = species_type),
by = "to") %>%
dplyr::left_join(edge_plausibility, by = c("from_type", "to_type")) %>%
# cull edges based on plausibility
dplyr::filter(plausibility != 0) %>%
dplyr::group_by(from, interaction_type) %>%
dplyr::mutate(plausible = sample(c(TRUE, FALSE), size = 1, prob = c(plausibility[1], 1-plausibility[1]))) %>%
dplyr::ungroup() %>%
dplyr::filter(plausible) %>%
dplyr::select(from, to, interaction_type) %>%
# add cognate edges
dplyr::bind_rows(
species_init %>%
dplyr::filter(species_type != "metabolite") %>%
tidyr::spread(species_type, species_id) %>%
dplyr::select(from = transcript, to = protein) %>%
dplyr::mutate(interaction_type = "cognate")
)
species_df = species_init %>%
dplyr::select(species_id, species_type) %>%
dplyr::arrange(species_id) %>%
# add some global properties which can be configured to be feature-specific
dplyr::mutate(
basal_prod = BASAL_PROD,
loss_rate = LOSS_RATE
)
# network summaries
pruned_network <- igraph::graph_from_data_frame(edges_df,
directed = TRUE,
vertices = species_df)
work_with_top_cluster <- FALSE
if (work_with_top_cluster) {
network_clusters <- igraph::clusters(pruned_network)
isolated_vertices <- names(network_clusters$membership)[network_clusters$membership != which.max(network_clusters$csize)]
pruned_network <- igraph::delete_vertices(pruned_network, isolated_vertices)
}
ggraph(pruned_network, layout = "fr") +
geom_edge_link(arrow = arrow(type = "closed", length = unit(0.07, "inches")), color = "gray50", alpha = 0.1) +
geom_node_point(aes(color = species_type)) +
scale_color_brewer("Species Type", palette = "Set2") +
theme_void() +
theme(legend.position = "bottom") +
ggtitle("Simulated multi-omic regulatory network")
edges_df %>%
dplyr::count(interaction_type) %>%
knitr::kable(caption = "Types of interactions in regulatory network") %>%
kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE, position = "left")
| interaction_type | n |
|---|---|
| cognate | 100 |
| enzyme | 39 |
| interconversion | 12 |
| MDI | 28 |
| MPI | 23 |
| PPI | 132 |
| TF | 130 |
| transcript -> metabolite | 39 |
| transcript -> protein | 142 |
| transcript -> transcript | 139 |
To represent the regulatory network as a series of differential equations, we need an expression relating the rate of change of a query species to the levels of all of its regulators. Such an expression needs to work for 0+ regulators and also produce biologically feasible dynamics (zero molecules shoot to \(\infty\) and molecules ideally shouldn’t go to zero either). We can represent how a biochemical process is impacted by an arbitrary set of regulators using convience kinetics.
For systems that are just impacted by activation and inhibition (i.e., ignoring substrate <-> product interconversion), such an expression will look like:
\[ \frac{dS_{k}}{dt} = \prod_{i \in \text{inhibitors}} \frac{k_i}{k_{i} + S_i}\prod_{a \in \text{activators}} \left( 1 + \frac{S_{a}}{k_{a}}\right) \] Here, \(S_{k}\) is the regulated species, \(i\)s index inhibitors and \(a\)s index activators. \(k_{i}\) and \(k_{a}\) are affinity constants reflecting half-max inhibition concentrations, and double activity concentrations respectively. (Note that this an atypical expression for activation but its nice here because it results in values > 1 while the alternative (S/(S + Ka)) results in values < 1.)
Convenience kinetics allows us to define how a process scales with regulators but to improve its behavior we can add basal synthesis and first-order decay terms. Basal synthesis will keep a molecules concentration from dropping to zero. First-order decay (or alternatively washout) decreases concentration proportionally to the current concentration thereby keeping concentrations from shooting to \(\infty\) (Note that this is still possible if a strong autoactivating circuit is created but this can be monitored or avoided by enforcing max rate.)
Adding basal synthesis and washout terms to convenience kinetics yields the final ODE:
\[ \frac{dS_{k}}{dt} = \alpha + \prod_{i \in \text{inhibitors}} \frac{k_i}{k_{i} + S_i}\prod_{a \in \text{activators}} \left( 1 + \frac{S_{a}}{k_{a}}\right) - \lambda S_{k} \] ## Finding a pre-induction steady state
With these ODEs and initial concentrations of all molecular species (here set at 1) we can simulate the time evolution of the system. If we did a good job in defining the ODEs they should reach a steady-state which can be treated analagously to a pre-induction IDEA chemostat culture.
reffect <- function (n, mean = 0.5, shape = 10) {
magnitudes <- rgamma(n, shape = shape, scale = mean / shape)
#effect <- magnitudes * (rbinom(n, 1, 0.5) - 0.5)*2
}
Deffects <- function(time, states, pars, species_df, time_resolution = 0.1) {
# takes the args of ode() and returns derivatives
# aligned with "states"
reg_states <- tibble::tibble(value = states) %>%
dplyr::mutate(species_id = 1:n()) %>%
dplyr::left_join(species_df, by = "species_id")
modulated <- pars %>%
dplyr::left_join(reg_states, by = c("from" = "species_id")) %>%
dplyr::mutate(rel_rate = dplyr::case_when(
act_or_inh == "inhibitor" ~ dissoc_constant / (dissoc_constant + value),
# force activation scaling to max out at 5X to avoid
act_or_inh == "activator" ~ pmin(1 + value / dissoc_constant, 10),
)) %>%
dplyr::group_by(to) %>%
dplyr::summarize(rate = prod(rel_rate))
reg_states <- reg_states %>%
dplyr::left_join(modulated, by = c("species_id" = "to")) %>%
dplyr::mutate(rate = ifelse(is.na(rate), 1, rate),
loss = -1 * value * loss_rate,
# basal production + variable production + washout
# scale to time resolution / step size
deriv = (basal_prod + rate + loss)*time_resolution)
state_derivs <- list(reg_states$deriv)
return (state_derivs)
}
states <- rep(1, nrow(species_df))
#reffect(1000, mean = MEAN_EFFECT_SIZE, shape = 1) %>%
# hist(breaks = 100)
pars <- edges_df %>%
dplyr::mutate(
# find an effect size and whether a relationship is activating or inhibiting
# TO DO - the effect size isn't used anymore
dissoc_constant = reffect(n(), mean = MEAN_EFFECT_SIZE, shape = 1),
act_or_inh = c("activator", "inhibitor")[rbinom(dplyr::n(), 1, 0.5)+1],
act_or_inh = ifelse(interaction_type == "cognate", "activator", act_or_inh),
)
# since production and degradation rates are not balanced initially
# we can simulate from initial conditions to find pre-perturbation initial steady state
ode_sim <- ode(
y = states,
times = seq(0, MAX_TIME, by = TIME_RESOLUTION),
func = Deffects,
parms = pars,
species_df = species_df,
time_resolution = TIME_RESOLUTION
)
simulated_dynamics <- ode_sim %>%
as.data.frame() %>%
tidyr::gather(species_id, value, -time) %>%
tibble::as_tibble() %>%
dplyr::mutate(species_id = as.integer(species_id))
simulated_dynamics %>%
dplyr::semi_join(
simulated_dynamics %>%
dplyr::distinct(species_id) %>%
dplyr::sample_n(12),
by = "species_id"
) %>%
ggplot(aes(x = time, y = value)) +
geom_path() +
facet_wrap(~ species_id, scale = "free_y") +
ggtitle("Simulation from initial conditions where concenrations are all 1 to find a stable steady-state")
# check for a steady state
time_derivatives <- simulated_dynamics %>%
dplyr::filter(time > MAX_TIME - 1) %>%
dplyr::arrange(time) %>%
tidyr::nest(vals = -species_id) %>%
dplyr::mutate(assym_summary = purrr::map(
vals,
function(x) {
tibble::tibble(
assym_value = mean(x$value),
# check differences between adjacent time steps
mean_D = mean(diff(x$value))
)
}
)) %>%
dplyr::select(-vals) %>%
tidyr::unnest(assym_summary)
stopifnot(
all(!is.na(time_derivatives$mean_D)),
all(abs(time_derivatives$mean_D) < 1e-5)
)
To create each perturbation timecourse we can:
# now lets separately perturb each species
simulate_perturbation <- function (perturbed_species_id, time_derivatives, species_df) {
checkmate::assertInteger(perturbed_species_id)
checkmate::assertChoice(perturbed_species_id, species_df$species_id)
perturbed_species_df <- species_df %>%
dplyr::mutate(basal_prod = ifelse(species_id == perturbed_species_id, basal_prod*20, basal_prod))
perturbation_sim <- ode(
y = time_derivatives$assym_value,
times = seq(0, 300, by = 0.5),
func = Deffects,
parms = pars,
species_df = perturbed_species_df
) %>%
as.data.frame() %>%
tidyr::gather(species_id, value, -time) %>%
tibble::as_tibble() %>%
dplyr::mutate(species_id = as.integer(species_id)) %>%
dplyr::group_by(species_id) %>%
# calculate fold-change relative to the t0 s.s.
dplyr::mutate(fold_change = value / value[time == 0]) %>%
dplyr::ungroup()
}
all_perturbation_timecourses <- time_derivatives %>%
dplyr::distinct(perturbed_species_id = species_id) %>%
# only look at a few experiments for now to tune parameters
#dplyr::slice(1:20) %>%
dplyr::mutate(perturbation_results = purrr::map(
perturbed_species_id,
simulate_perturbation,
time_derivatives,
species_df
)
)
all_perturbation_timecourses %>%
tidyr::unnest(perturbation_results) %>%
dplyr::inner_join(
pars %>% dplyr::select(from, to, act_or_inh),
by = c("perturbed_species_id" = "from", "species_id" = "to")
) %>%
dplyr::mutate(tc_id = glue::glue("{perturbed_species_id}_{species_id}")) %>%
ggplot(aes(x = time, y = fold_change, group = tc_id, color = act_or_inh)) +
geom_path() +
ggtitle("Direct activation / inhibition") +
theme_bw()
all_perturbation_timecourses %>%
dplyr::slice(1:10) %>%
#dplyr::sample_n(10) %>%
tidyr::unnest(perturbation_results) %>%
dplyr::mutate(tc_id = glue::glue("{perturbed_species_id}_{species_id}")) %>%
ggplot(aes(x = time, y = fold_change, group = tc_id)) +
geom_path() +
facet_wrap(~ perturbed_species_id, scale = "free_y") +
ggtitle("Random perturbation experiments") +
theme_bw()
perturbation_dfs <- all_perturbation_timecourses %>%
tidyr::unnest(perturbation_results)
readr::write_tsv(perturbation_dfs, "~/Desktop/synthetic_idea_timecourses")
readr::write_tsv(pars, "~/Desktop/synthetic_idea_parameters")